Dr. Yang Ye <Email:yy@runchee.com>
Nov 09, 2017
Let’s review some R basics.
Vector/Matrix/List/Dataframe
# Create a vector from number
v <- c(1, 3)
v[1] <- 3
v
## [1] 3 3# repeat 100 for 10 times.
rep(100, 10)
## [1] 100 100 100 100 100 100 100 100 100 100# create matrix of 10x10
mat <- matrix(2, 3, 4)
mat
## [,1] [,2] [,3] [,4]
## [1,] 2 2 2 2
## [2,] 2 2 2 2
## [3,] 2 2 2 2
# set first row to 4
mat[1,] <- 4
# set element (2, 2) to 6
mat[2, 2] <- 6which()match()%in%data <- 10:1
match(c(1, 3), data)
## [1] 10 8
data[match(c(1, 3), data)]
## [1] 1 3
which(1 == data | 3 == data)
## [1] 8 10
data[which(1== data | 3 == data)]
## [1] 3 1match(c(11, 31), 10:1)
## [1] NA NA
which(11== 10:1 | 31 == 10:1)
## integer(0)if (all(c(1, 33) %in% 1:3)) {
cat("Found all\n")
}
if (any(c(1, 33) %in% 1:3)) {
cat("Found one/some.\n")
}
## Found one/some.# Norm random number
rnorm(3, mean = 10, sd = 3)
## [1] 6.575252 11.226616 13.571712# Uniform random number
runif(3)
## [1] 0.2201473 0.1328510 0.3524160# Sample
sample(1:10, 10, replace = F)
## [1] 8 4 5 9 1 6 7 3 2 10
# To Be/Not to Be
sample(c(T, F), 10, replace = T)
## [1] FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE TRUE FALSE
# Throw a dice
sample(1:6, 10, replace = T)
## [1] 5 4 5 6 4 4 4 2 5 4cat(paste0(..., "\n")) is what I used most."\n" is appended to the end to create a line break.paste0/paste can use to create new strings from any data types.paste0 combines any thing without space. paste uses space, by default.paste0/paste with collapse helps with vector to print them in one line.paste0/paste works with all types of data.x <- c(Sys.Date(), Sys.Date(), Sys.Date())
cat(paste0("Current dates is ", x, ".\n"))
## Current dates is 2017-11-08.
## Current dates is 2017-11-08.
## Current dates is 2017-11-08.
cat(paste0("Current dates is ", paste0(x, collapse = ", "), ".\n"))
## Current dates is 2017-11-08, 2017-11-08, 2017-11-08.# sub-string
# substr(x, start, stop)
substr("The fox jumps.", 6, 6 + 5 - 1)
## [1] "ox ju"# paste0/paste to concatenate string/convert to string
new_string <- paste0("This is ", "cat")
new_string <- paste0("This is ", "cat", sep = "a")
new_string <- paste0(1:3, sep = "a")# toupper/tolower
toupper("big")
## [1] "BIG"
tolower("LOWER")
## [1] "lower"# grepl: Find, returns T or F
grepl("A", "ABC", fixed = T)
## [1] TRUE
grepl("D", "ABC", fixed = T)
## [1] FALSE# sub: replace for one time
# sub(pattern, replace, string,...)
# fixed = T means use fixed string. Not regular expression
sub("D", "ABC", "DDD", fixed = T)
## [1] "ABCDD"
# gsub: replace for all
gsub("D", "ABC", "DDD", fixed = T)
## [1] "ABCABCABC"If you start to use regular expression, sub/grepl becomes super powerful.
# If we need to find `Start` appearing the beginning of the string
grepl("^Start", "Start with me")
## [1] TRUE
grepl("^Start", "me Start")
## [1] FALSE# To find something in the end
sub("X$", "Z", "XYZ ends with X")
## [1] "XYZ ends with Z"sub("[^\\_]+\\_.*", "", "USDCNY_M1")
## [1] ""[^\\_]: Character not containing _. Because _ is a special character, we quote it with two backslashes.+: One or more.: Any character*: none or more.# Rough cut
sub("([^\\_]+)\\_.*", "\\1", "USDCNY_M1")
## [1] "USDCNY"# Nice cut
sub("([^\\_]+)\\_(.*)", "\\1 \\2", "USDCNY_M1")
## [1] "USDCNY M1"# Wonderful cut
sub("([^\\_]+)\\_([[:alpha:]])([[:digit:]])", "\\1 \\2 \\3", "USDCNY_M1")
## [1] "USDCNY M 1"Cheatsheat is available at https://www.rstudio.com/resources/cheatsheets/
# Create date
dt1 <- as.Date("2017-11-03")
dt1
## [1] "2017-11-03"
dt2 <- Sys.Date()
dt2
## [1] "2017-11-08"library(lubridate)# Date is such a central role in finance.
# More function about date can be found in package `lubridate`
# Create date with lubridate, a package which provides lots of date functions.
ymd(20171003)
## [1] "2017-10-03"
ymd("20171003")
## [1] "2017-10-03"We can use codes for convert date to/from string.
format(Sys.Date(), format = "%Y/%m/%d")
## [1] "2017/11/08"as.Date("2017-11-03", format = "%Y-%m-%d") # %m for number month
## [1] "2017-11-03"
as.Date("2017-Nov-03", format = "%Y-%b-%d") # %b for the 3-letter month
## [1] "2017-11-03"
as.Date("03Nov2017", format = "%d%b%Y")
## [1] "2017-11-03"lubridatelibrary(lubridate)
# Change a date
x <- as.Date("2017-10-10")
month(x) <- 1
x
## [1] "2017-01-10"# Set to the end of the month
day(x) <- days_in_month(x)Use package bizdays
# install.packages("bizdays")
library(bizdays)# 'weekends' is a calendar of weekdays
bizdays("2017-10-16", "2017-10-30", "weekends")
## [1] 10
# add bizdays
add.bizdays("2017-11-03", 5, "weekends")
## [1] "2017-11-10"
# generate all business days between.
# You will find this useful for later financial application.
bizseq("2017-11-10", "2017-12-01", "weekends")
## [1] "2017-11-10" "2017-11-13" "2017-11-14" "2017-11-15" "2017-11-16"
## [6] "2017-11-17" "2017-11-20" "2017-11-21" "2017-11-22" "2017-11-23"
## [11] "2017-11-24" "2017-11-27" "2017-11-28" "2017-11-29" "2017-11-30"
## [16] "2017-12-01"If not provided, start.date is by default the first holiday and end.date is the last holiday. So we provide them here.
create.calendar(name="Singapore", holidays = c(as.Date("2017-10-18")),
start.date = as.Date("2017-01-01"), end.date = as.Date("2071-01-01"),
weekdays = c("saturday", "sunday"))
bizdays("2017-10-13", "2017-10-30", "weekends")
## [1] 11
# One day less
bizdays("2017-10-13", "2017-10-30", "Singapore")
## [1] 10Input parameters
func1 <- function() { }
func2 <- function(input1, input2) { }
# Param input1 is default to 1
func3 <- function(input1 = 1, input2) { }
func4 <- function(input1, input_type = c("int", "char"))
{
# This would check wheher input_type is set to one of the pre-set values.
input_type = match.arg(input_type)
}
func5 <- function(in1, in2) {
if (in1 < 0) {
return(0)
} else {
return(in1 + in2)
}
}
# The last value before function finishes will be returned automatically. No need to use return.
func5 <- function(in1, in2) {
if (in1 < 0) {
0
} else {
in1 + in2
}
}
# Unless there is extra steps before
func6 <- function(in1, in2) {
if (in1 < 0) {
return(0) # if we have 0 here, it's not the last step before function exits.
} else {
res <- in1 + in2
}
res <- res * 3
res
}Write functions to do
Convert time to character/string
format(Sys.time(), format = "%H%M")
## [1] "0043"
format(Sys.time(), format = "%H:%M:%S")
## [1] "00:43:24"
format(Sys.time(), format = "%H:%M:%S")
## [1] "00:43:24"
library(lubridate)
ymd_hms("2011-12-31 12:59:59")
## [1] "2011-12-31 12:59:59 UTC"Change time, lubridate provides hour, minute
x <- Sys.time()
x
## [1] "2017-11-08 00:43:24 +08"
hour(x) <- 12
x
## [1] "2017-11-08 12:43:24 +08"
minute(x) <- 3
x
## [1] "2017-11-08 12:03:24 +08"
minute(x) <- 123 # what will happen?
x
## [1] "2017-11-08 14:03:24 +08"# Create list with list() function
# Nameless list
# list[_n_] => item by order
a <- list(3, 4)
a[[1]]
## [1] 3
a[[2]]
## [1] 4
# Named list, you can use $ and [ operators
# list[[]]: gives back a value
# list$name => list[["name"]]
a <- list(a = 3, b = 4)
a[[1]]
## [1] 3
a[[2]]
## [1] 4
a[["a"]]
## [1] 3
a$a
## [1] 3# When you want to use a number as key, use backtick
list_of_strikes <- list()
list_of_strikes$`65` <- 3
list_of_strikes$`60` <- 4
# if a name doesn't exist in the list
a$c
## NULL
# Use `is.null()` to check
if (is.null(a$c)) {
cat("c doesn't exist in list a\n")
}
## c doesn't exist in list a# List can be used as map/dictionary.
# Map
basket <- sample(c("Apple", "Orange", "Pear"), 100, replace = T)
fruit_count <- list()
for (b in basket) {
if (is.null(fruit_count[[b]])) {
fruit_count[[b]] <- 1
} else {
fruit_count[[b]] <- fruit_count[[b]] + 1
}
}
fruit_count
## $Pear
## [1] 33
##
## $Orange
## [1] 30
##
## $Apple
## [1] 37# Let's write a generic function to do this
add_to_map <- function(map, key, value) {
if (is.null(map[[key]])) {
map[[key]] <- value
} else {
map[[key]] <- map[[key]] + value
}
map
}
# You may copy function add_to_map to every file that you want to use this kind of dictionary
fruit_count <- add_to_map(fruit_count, "Pomelo", 12)
fruit_count
## $Pear
## [1] 33
##
## $Orange
## [1] 30
##
## $Apple
## [1] 37
##
## $Pomelo
## [1] 12# Use case 1: Use list to pass data in or out.
do_lots_of_work <- function(a, b, c) {
}
# pass in
do_lots_of_work <- function(lst) {
lst$a + lst$b
}
# pass out
ret_lots_of_work <- function() {
return(list(a = a, b = b))
}
res <- ret_lots_of_work()
res$a
## $a
## [1] 3
##
## $b
## [1] 4
res$b
## [1] "Apple"# Case 2: configuration
app_config <- list(MAX = 10, MIN = 10, DISPLAY_RESULT = T)
do_lots_of_work <- function(app_config) {
app_config$MAX
}# Object
# Define class with attributes.
vanilla_option <- setClass("vanilla_option",
slots = c(type = "character",
strike = "numeric",
underlying = "numeric"))
# Create object, either way
opt1 <- new("vanilla_option", type = "c", strike = 100, underlying = 100)
opt2 <- vanilla_option(type = "c", strike = 100, underlying = 100)
# Use @ to visit member. or,
opt1@type
## [1] "c"
slot(opt1, "strike")
## [1] 100# Generate a vector of options
opts <- sapply(1:10000, function(x) {
vanilla_option(type = sample(c("c", "p"), 1),
strike = round(runif(1) * 100, 0),
underlying = round(runif(1) * 100, 0)) } )
# install.packages("fOptions")
library(fOptions)
start <- Sys.time()
# GBSOption also returns an object. We just need its price attribute.
res1 <- sapply(opts, function(o) {
(GBSOption(o@type, o@underlying, o@strike, Time = 1,
r = 0.01, b = 0, sigma = 0.3))@price
})
cat(as.numeric(Sys.time() - start))
## 2.713003
head(res1, n = 4)
## [1] 3.597272e-04 3.960199e+01 1.130114e-01 3.231863e+00
# Alternatively to sapply, we can use map* functions from purrr package
# map is a generic function that returns a list
# map_dbl is for result of double, it would return a vector
res2 <- purrr::map_dbl(opts, function(o) {
(GBSOption(o@type, o@underlying, o@strike, Time = 1,
r = 0.01, b = 0, sigma = 0.3))@price
})
head(res2, n = 4)
## [1] 3.597272e-04 3.960199e+01 1.130114e-01 3.231863e+00# Function that's defined in-place, which doesnt' need to have a name.
(function(x) { print(x) }) (3)
## [1] 3
# if there is only one line, you can skip { }
(function(x) print(x)) (3)
## [1] 3
# For longer functions, you can make it multi-lines.
(function(x) {
if (x > 3) {
print(x)
} else {
print(x - 3)
}
})(3)
## [1] 0# These two are equivalent.
res1 <- purrr::map(1:10, function(x) { rnorm(x, n = 10) })
# function(x) func(x) can be simplied as func.
res2 <- purrr::map(1:10, rnorm, n = 10)
head(res1, n = 1)
## [[1]]
## [1] 2.04172131 1.15437251 0.48023393 -0.20466246 1.56425258
## [6] 2.76269333 2.05444062 2.54427530 -0.04313395 1.68816826
# purrr:map returns a list()# This is what we really want to do. Generate ten normal distribution and get their mean.
# rnorm(n, mean = 0, sd = 1). Where doesn't input go to?
res <- purrr::map(1:10, rnorm, n = 1000)
map_dbl(res, mean)
## [1] 0.9938541 1.9212187 3.0139946 4.0068545 4.9709724 5.9927339 6.9607755
## [8] 7.9686268 8.9490252 9.9611648
# sapply achieves the same as purrr::map, a bit slower.
# Package purrr succeeds original R base.
sapply(1:10, function(x) x ^ 2 )
## [1] 1 4 9 16 25 36 49 64 81 100
sapply(1:10, function(x) `^`(x, 2) )
## [1] 1 4 9 16 25 36 49 64 81 100
sapply(1:10, function(x) `^`(2, x) )
## [1] 2 4 8 16 32 64 128 256 512 1024# set working directory
setwd("C:/TEMP")
# Save this_is_var1 to a file
saveRDS(this_is_var1, file = "C:/TEMP/DATA/data.Rds")
# Load a variable from a file. `new_loaded` is the name given to it.
new_loaded <- readRDS(file = "C:/TEMP/DATA/data.Rds")
\\ or single backslash /. e.g. C:\\TEMP\\DATA, C:/TEMP/DATA/Users/.../The basic structure of a data frame:
df <- data_frame(
date = seq(as.Date("2017-01-01"), as.Date("2017-01-10"), by = "day"),
stock = replicate(10, paste0(sample(LETTERS, 3, replace = T), collapse = "")),
quantity = round(runif(10) * 10000 ,0))
# df["date"]: gives a data frame
# df[["date"]]: gives value
# df$date: same as [["date"]]
# Get three rows
df[c(3, 6, 9), , drop = F]# Get three columns
df[, 1, drop = F]# This would return a vector
df[, 1, drop = T]
## Warning: drop ignored
# Use column names
df[, c("date", "quantity"), drop = F]Common functions for data frame
View()
head()
tail()
str()
nrow()
ncol()
dim() # returns both nrow and ncol
colnames()/rownames()
## [1] 14
## [1] 101
## [1] 37.084
## res_sim: 2.9431
## res_ana: 2.92896825396825
library(shiny)
ui <- fluidPage("Hello World")
server <- function(input, output, session) { }
shinyApp(ui = ui, server = server)
ui <- fluidPage(
titlePanel("Hello World with a Histogram"),
# Input() functions
numericInput("num", "Number of Sample", value = 30),
# Output() functions
plotOutput("hist")
)
All input function follow such function signature except for input-specific parameters.
inputXXX(inputId = "input name", label = "label to display", ...)
All output function follow such pattern.
yyyOutput(outputId = "output name")
plotOutput: I suggest to set width and height to fixed size so we need extra parameters. For others, outputId is good enough.
Sever is to fill the content of output
server <- function(input, output, session) {
# Enable either one of two
output$hist <- renderPlot({ hist(rnorm(100)) })
if (FALSE) {
output$hist <- renderPlot({
title("a normal random number histogram")
hist(rnorm(input$num))
})
}
}
UI and Server combines to be a ShinyApp. UI is to run the same for each browser/client. Server is separate between different users.
shinyApp(ui, server)
input$num ------> output$p1Reactive values work together with reactive functions.
input$x => output$yinput$x => expression() => output$yReactivity is enabled by placing inputXXX in renderXXX function.
library(shiny)
ui <- fluidPage(
numericInput("num", "Num", 100),
# numericInput("mean", "Mean", 5),
# numericInput("sd", "SD", 3),
numericInput("lambda", "Lambda", 1),
plotOutput("p1")
)
server <- function(input, output, session) {
output$p1 <- renderPlot({
# hist(rnorm(input$num, mean = input$mean, sd = input$sd))
hist(rpois(n = input$num, lambda = input$lambda))
})
}
shinyApp(ui, server)
observeEvent to observe button action, and isolate to cut down the link of inputXXX in renderXXX, so button can work.isolate?library(shiny)
ui <- fluidPage(
numericInput("num", "Num", 10),
actionButton("go", "Go"),
plotOutput("p1")
)
server <- function(input, output, session) {
observeEvent(input$go, {
output$p1 <- renderPlot({
hist(rnorm(isolate(input$num)))
})
})
}
shinyApp(ui, server)
We can add a reactiveValue with eventReactive
library(shiny)
ui <- fluidPage(
numericInput("num", "Num", 10),
actionButton("go", "Go"),
plotOutput("p1")
)
server <- function(input, output, session) {
data <- eventReactive(input$go, {
hist(rnorm(input$num))
})
output$p1 <- renderPlot({ data() })
}
shinyApp(ui, server)
For tableOutput
output$t1 <- renderTable(iris)
output$t1 <- renderTable({
some input..
output is a data frame.
})
For dataTableOutput (Dynamic table)
output$t2 <- renderDataTable(iris)
For plotOutput
output$p2 <- renderPlot({ plot(runif(1000), runif(1000)) })
For textOutput and verbatimTextOutput
output$t3 <- renderText({ "foo" })
output$t4 <- renderPrint({
print("foo")
print("bar")
})
library(shiny)
library(DT)
ui <- fluidPage(
h3("t1"),
tableOutput("t1"),
hr(),
fluidRow(
column(9, h3("dt1"),
dataTableOutput("dt1")),
column(3, h3("x4"),
verbatimTextOutput("x4"))),
hr(),
fluidRow(
column(8, h3("dt2"),
dataTableOutput("dt2")),
column(4, h3("p5"),
plotOutput("p5")))
)
options(error = function() traceback(2))
server <- function(input, output, session) {
output$t1 <- renderTable(iris[1:10,], striped = T, hover = T)
output$dt1 <- renderDataTable(iris, options = list( pageLength = 5))
output$x4 <- renderPrint({
s = input$dt1_rows_selected
if (length(s)) {
cat('These rows were selected:\n\n')
cat(s, sep = ', ')
}
})
output$dt2 <- renderDataTable(iris,
options = list(pageLength = 5),
server = FALSE)
output$p5 <- renderPlot({
s <- input$dt2_rows_selected
plot(iris$Sepal.Length, iris$Sepal.Width)
if (length(s)) {
points(iris[s, c("Sepal.Length", "Sepal.Width"), drop = F],
pch = 19, cex = 1, col = "red")
}
})
}
shinyApp(ui, server)
Add a selectInput for different color names, returned from colors().
plot(1:10, pch = 19, cex = 1, col = “skyblue1”)
\(NPV = \frac{Cashflow 1}{(1 + yield)^1} + \frac{Cashflow 2}{(1 + yield)^2} + ... + \frac{Last Cashflow}{(1 + yield)^n}\)
For a Bond with fixed coupon \(Bond Price = Coupon * \frac{1 - (\frac{1}{(1 + yield)^n})}{yield} + \Big[MaturityValue * \frac{1}{(1 + yield)^n}\Big]\)
install.packages(“tidyverse”)
Let’s start from SQL, which first appeared in 1974; 43 years ago.
Data engineering was born around 70s with SQL.
# Select everything from Shops.
SELECT * FROM Shops;
# Select with a filter
SELECT * FROM Shops WHERE size = "Big";
# Select with a filter and order
SELECT * FROM Shops WHERE size = "Big" ORDER BY Name;
# Select with a filter, order, group and summary function `sum`
SELECT Region, sum(Sales) FROM Shops WHERE size = "Medium" GROUP BY Region;
# Insert a new record to Shops.
INSERT into Shops (Name, Region, Sales) VALUES ("Costco", "North", 123456, ...);
# Update a field
UPDATE Shops SET Sales = Sales + 1000 WHERE Name = "Costco";
# Delete from Shops with a filter
DELETE from Shops WHERE Sales < 1000
df <- data.frame(a = 1:10, b = 10:1)
# Filter:
df[which(df$a == 3 | df$b == 3), , drop = T] df[match(3, df$a), , drop = T]
## $a
## [1] 3
##
## $b
## [1] 8
df[, match("b", colnames(df)), drop = T]
## [1] 10 9 8 7 6 5 4 3 2 1
# Insert
rbind(df, df)
# Delete
df[-(which(df$a == 3 | df$b == 3)), , drop = T]
# Update
df[which(df$a == 3 | df$b == 3), 2] <- 3dplyr package from tidyverse is a high-performance package to deal with data frame.
# tidyverse is a bundle of packages.
# I usually load them all with library(tidyverse, instead of library(dplyr) individually.
library(tidyverse)
# Loading tidyverse: ggplot2
# Loading tidyverse: tibble
# Loading tidyverse: tidyr
# Loading tidyverse: readr
# Loading tidyverse: purrr
# Loading tidyverse: dplyr
# Note:
# filter(): dplyr, stats
# lag(): dplyr, stats
# Use dplyr::lag and dplyr::filter when it doesn't work.dplyr provides functions in “verbs”, which is functions that does one thing only. We will learn to use the following.
I chose this data set of a Portuguese retail bank clients profile
variable = column = field
Use RStudio’s File -> Import Dataset, you may choose either “From Text (base)” or “From Text (readr)”. Either way loads the data.
base comes with R. readr is a package from tidyverse that provides more options and functionality. Copy the generated code to your script file.
I place it at https://goo.gl/fFQAAm (for Download), https://goo.gl/PBQnBt (for direct use).
You may download it and save it to local.
# Use base
bank <- read.csv("example/data-bank/bank.csv", sep=";") # or,
bank <- read.csv("https://goo.gl/PBQnBt", sep = ";")
# use readr
library(readr)
bank <- read_delim("example/data-bank/bank.csv",
";", escape_double = FALSE, trim_ws = TRUE)
## Parsed with column specification:
## cols(
## age = col_integer(),
## job = col_character(),
## marital = col_character(),
## education = col_character(),
## default = col_character(),
## balance = col_integer(),
## housing = col_character(),
## loan = col_character(),
## contact = col_character(),
## day = col_integer(),
## month = col_character(),
## duration = col_integer(),
## campaign = col_integer(),
## pdays = col_integer(),
## previous = col_integer(),
## poutcome = col_character(),
## y = col_character()
## )View(bank)
selectselect(df, ...), … can be
- means exclude)select - Examplessubset <- select(bank, marital)
subset <- select(bank, 1)
subset <- select(bank, -1)
subset <- select(bank, -job)
subset <- select(bank, -(job:education))
subset <- select(bank, starts_with("p"))
subset <- select(bank, ends_with("p"))
subset <- select(bank, contains("p"))select as a re-arrangement of columns.job_first <- select(bank, job, everything())
bankfiltercolnames(bank)
## [1] "age" "job" "marital" "education" "default"
## [6] "balance" "housing" "loan" "contact" "day"
## [11] "month" "duration" "campaign" "pdays" "previous"
## [16] "poutcome" "y"
young <- dplyr::filter(bank, age < 40)
another_young <- dplyr::filter(bank, age < 20 & marital == "married")
just_young <- dplyr::filter(bank, age < 20 & marital == "single")
young2 <- dplyr::filter(bank, age >= 20 & age < 30)
another_young2 <- dplyr::filter(bank, age >= 20 & age < 30 & marital == "married")
just_young2 <- dplyr::filter(bank, age >= 20 & age < 30 & marital == "single")# %in% to match multiple
second_upper <- dplyr::filter(bank, education %in% c("tertiary", "secondary"))
# filter out NA value.
no_na <- dplyr::filter(bank, is.na(balance) | balance > 0)# rename(new name = old)
# Use tick to quote special strings.
df <- rename(bank, young_age = age)
df <- rename(bank, `Age in Bank` = age)# arrange is sort
arrange(bank, job)arrange(bank, default, job)
# descending for day
arrange(bank, desc(day))arrange(bank, desc(as.Date(day, format="%d", origin = Sys.Date())))NB: Missing values are always sorted at the end.
How could you use arrange() to sort all missing values to the start? (Hint: use is.na()).
arrange(bank, !is.na(a), a)
Find the eldest?
# Replace existing
# ifelse is to check condition.
df1 <- mutate(bank, y = ifelse(y == "yes", T, F))
# Add a new column.
df2 <- mutate(bank, duration_diff = duration - mean(duration, na.rm = TRUE))
# case_when is a function to deal multiple choices.
mutate(bank, age_group = case_when(
age < 20 ~ "youth",
age < 40 ~ "middle-age",
age < 50 ~ "senior",
TRUE ~ "happy"
))
firstup <- function(x) {
substr(x, 1, 1) <- toupper(substr(x, 1, 1))
x
}
# month.abb is a built-in array of month names.
df3 <- mutate(bank, month_name = factor(firstup(as.character(month)), levels = month.abb))
# transmute would remove all other columns after mutation, only keeping the new variable.
df5 <- transmute(bank,
duration_trend = duration - mean(duration, na.rm = TRUE),
balance_trend = balance - mean(balance, na.rm = TRUE))mutaterow_number()/min_rank()/ntile(,n)
nrow(distinct(letters, writer))
y <- c(1, 2, 2, NA, 3, 4)
row_number(y)
## [1] 1 2 3 NA 4 5
min_rank(y)
## [1] 1 2 2 NA 4 5
ntile(y, 2)
## [1] 1 1 1 NA 2 2We may write such code.
df <- select(df, x)
df <- mutate(df, a = 1)
df <- rename(df, a = b)
df <- arrange(df, x)
# This is effectively,
arrange(rename(mutate(select(df, x), a = 1), a = b), x)
third(second(first(x)))
How about this?
df %>% select %>% mutate %>% rename %>% arrange
%>% operator allows you to transform the flow from nesting to left-to-right fashion, i.e.
first(x) %>% second() %>% third()
x %>% first() %>% second() %>% third() # this could also do.
x %>% first(.) %>% second(.) %>% third(.) # . represents the input
What’s the output of below?
c(1, 3, 7, 9) %>% {
print(.)
mean(.)
} %>% { . * 3 } %>% {
print(.)
sample(round(., 0))
}
## [1] 1 3 7 9
## [1] 15
## [1] 5 8 13 14 4 11 15 2 9 7 1 12 6 10 3%>% … %>%
# Feed the data for multiple processing
{
v <- .
cn <- colnames(v)
v <- select(v, u, z)
colnames(v) <- cn[1:3]
v
}
# How to return multiple value
%>%
assign("data_name", data, envir = parent.env(environment()) )
} %>% {
select(., z < 0.4)
}
# or, we use list
%>% {
list(a, b)
} %>% {
v <- .
v$a
v$b
}
df %>%
... %>%
... %>%
... %>%
{
v <- .
ggplot(data = v) +
# full data is used here
geom_line(data = v) +
# partial data needs to be hightlighted.
geom_line(data = filter(., some condition), color = "red")
}
Pros:
Cons:
{ print(.); filter(., ...) } to print intermediate resuls.Separate the long pipes into shorter pipes, adding more intermediate variables.
Pipes are fundamentally linear and expressing complex relationships with them will typically yield confusing code.
Environment is where your data resides. Use local() to isolate.
# local stores the data wihtin the boundary of {}
x <- 3
local({
print(x)
x <- 1
print(x)
})
## [1] 3
## [1] 1
print(x)
## [1] 3# local stores the nearest environment
x <- 3
{
print(x)
x <- 1
print(x)
}
## [1] 3
## [1] 1
x
## [1] 1Use assign() to space-jump.
# assign data to global environment
x <- 1
pass_out_global <- function() {
assign("x", 3, envir = .GlobalEnv)
}
# assign data to just one level up
pass_out <- function() {
assign("x", 2, envir = parent.env(environment()))
}x <- 1
pass_out()
x
## [1] 2
# assign data to pass it out of function
extra_layer <- function() {
pass_out()
}
x <- 1
extra_layer()
x
## [1] 2
extra_layer_g <- function() {
pass_out_global()
}
x <- 1
extra_layer_g()
x
## [1] 3